{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import re\n",
    "import jieba\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "\n",
    "\n",
    "def clean_str(string):\n",
    "    \"\"\"\n",
    "    该函数的作用是去掉一个字符串中的所有非中文字符\n",
    "    :param string:\n",
    "    :return: 返回处理后的字符串\n",
    "    \"\"\"\n",
    "    string.strip('\\n')\n",
    "    string = re.sub(r\"[^\\u4e00-\\u9fff]\", \" \", string)\n",
    "    string = re.sub(r\"\\s{2,}\", \" \", string)\n",
    "    return string.strip()\n",
    "\n",
    "\n",
    "def cut_line(line):\n",
    "    \"\"\"\n",
    "    该函数的作用是 先清洗字符串，然后分词\n",
    "    :param line:\n",
    "    :return: 分词后的结果，如 ：     衣带  渐宽  终  不悔\n",
    "    \"\"\"\n",
    "    line = clean_str(line)\n",
    "    seg_list = jieba.cut(line)\n",
    "#     cut_words = \" \".join(seg_list)\n",
    "    return list(seg_list)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>text</th>\n",
       "      <th>labels</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>飞扬 哥哥 晚上 好</td>\n",
       "      <td>positive</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>请说出 韩国 最佳 女子 组合 几个</td>\n",
       "      <td>positive</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>觉得 做 那种 人 最 幸福</td>\n",
       "      <td>positive</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>雨涵 这次 不错</td>\n",
       "      <td>positive</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>暗黑 破坏神 最 穿越 多久 没 更</td>\n",
       "      <td>negative</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                 text    labels\n",
       "0          飞扬 哥哥 晚上 好  positive\n",
       "1  请说出 韩国 最佳 女子 组合 几个  positive\n",
       "2      觉得 做 那种 人 最 幸福  positive\n",
       "3            雨涵 这次 不错  positive\n",
       "4  暗黑 破坏神 最 穿越 多久 没 更  negative"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train = pd.read_csv('../data/training_set.csv', encoding=\"gbk\")\n",
    "train.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['飞扬', '哥哥', '晚上', '好']\n",
      "['请说出', '韩国', '最佳', '女子', '组合', '几个']\n",
      "['暗黑', '破坏神', '最', '穿越', '多久', '没', '更']\n",
      "['平凡', '发现', '你们', '哪边', '人', '很', '喜欢', '吃', '粽子']\n",
      "['要', '知道', '只要', '阳光', '地方', '必然', '会', '黑暗', '往好', '看', '就行了']\n",
      "['刚刚', '试', '问', '叫', '啥', '又', '说', '回来', '说话', '还', '不行', '不会', '变通']\n",
      "['怎么', '能', '说', '自己', '机器人', '呢', '经历', '太', '坎坷', '吧', '还是', '工作', '需要', '呢']\n",
      "['明天', '想', '去', '广州', '说', '要', '穿', '个', '裙子', '去年', '还是', '她', '哥', '哎', '又', '哭泣', '呢']\n",
      "['阿狸', '歪', '脖', '达达', '兔', '歪', '脖', '但丁', '歪', '脖', '张', '小盒', '歪', '脖', '小', '纯洁', '歪', '脖']\n",
      "['想', '买', '一个', '大', '号', '垃圾桶', '洗', '干净', '装', '冰淇淋', '倒', '进', '草莓', '坐', '地板', '上', '拌', '着', '吃']\n",
      "19\n"
     ]
    }
   ],
   "source": [
    "texta = train.text.tolist()\n",
    "max_len = 0\n",
    "for i in range(len(texta)):\n",
    "    text = []\n",
    "#     text = texta[i].replace(' ', '')\n",
    "    text = texta[i].split(' ')\n",
    "    if max_len < len(text):\n",
    "        print(text)\n",
    "    max_len = max(max_len, len(text))\n",
    "print(max_len)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "17\n"
     ]
    }
   ],
   "source": [
    "val = pd.read_csv('../data/validation_set.csv', encoding=\"gb18030\")\n",
    "texta = val.text.tolist()\n",
    "max_len = 0\n",
    "for i in range(len(texta)):\n",
    "    text = []\n",
    "#     text = texta[i].replace(' ', '')\n",
    "    text = texta[i].split(' ')\n",
    "    max_len = max(max_len, len(text))\n",
    "print(max_len)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "                  text    labels\n",
      "11557             做 漂亮  positive\n",
      "11558           做 情人 吧  positive\n",
      "11559  做 一个 才华横溢 机器人 吧  positive\n",
      "11560         坐骑 能量 恢复  positive\n",
      "11561            坐骑 升级  positive\n",
      "(36175, 2)\n"
     ]
    }
   ],
   "source": [
    "data = pd.concat([train, val])\n",
    "print(data.tail())\n",
    "print(data.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
